[133]:
using GraphMakie
using CSV
using DataFrames
using DelimitedFiles
using ColorSchemes
using Plots
using PlotThemes
using SankeyPlots
using Tables
using StatsPlots
using StatsBase
using LinearAlgebra
using DataFrames

theme(:ggplot2, palette=:Pastel2_8)
[134]:
unwrapped_complex_names = vec(readdlm("data/complex_names.txt", '\t', String, '\n'))
unwrapped_monomer_names = vec(readdlm("data/protein_names.txt", '\t', String, '\n'))
unwrapped_cofactor_names = vec(readdlm("data/cofactor_names.txt", '\t', String, '\n'))
unwrapped_element_names = vec(readdlm("data/element_ids.txt", '\t', String, '\n'))
unwrapped_pathway_names = vec(readdlm("data/pathway_names.txt", '\t', String, '\n'))
unwrapped_complex_names = vec(readdlm("data/complex_names.txt", '\t', String, '\n'))
unwrapped_monomer_names = vec(readdlm("data/protein_names.txt", '\t', String, '\n'))
unwrapped_cofactor_names = vec(readdlm("data/cofactor_names.txt", '\t', String, '\n'))
unwrapped_element_names = vec(readdlm("data/element_ids.txt", '\t', String, '\n'))
unwrapped_pathway_names = vec(readdlm("data/pathway_names.txt", '\t', String, '\n'))

complex_ids = vec(readdlm("data/complex_ids.txt", '\t', String, '\n'))
monomer_ids = vec(readdlm("data/protein_ids.txt", '\t', String, '\n'))
cofactor_ids = vec(readdlm("data/cofactor_ids.txt", '\t', String, '\n'))
element_ids = vec(readdlm("data/element_ids.txt", '\t', String, '\n'))
pathway_ids = vec(readdlm("data/pathway_ids.txt", '\t', String, '\n'))
aa_ids = vec(readdlm("data/amino_acid_ids.txt", '\t', String, '\n'))


unwrapped_protein_names = [unwrapped_complex_names; unwrapped_monomer_names]
protein_ids = [complex_ids; monomer_ids]

C = Matrix(DataFrame(CSV.File("data/C_matrix.csv", header=false)))
P = Matrix(DataFrame(CSV.File("data/P_matrix.csv", header=false)))
E = Matrix(DataFrame(CSV.File("data/E_matrix.csv", header=false)))
W = Matrix(DataFrame(CSV.File("data/W_matrix.csv", header=false)))
W2 = Matrix(DataFrame(CSV.File("data/W2_matrix.csv", header=false)))
W1 = Matrix(DataFrame(CSV.File("data/W1_matrix.csv", header=false)))

A = Matrix(DataFrame(CSV.File("data/A_matrix.csv", header=false)))

Tree = Matrix(DataFrame(CSV.File("data/tree_matrix.csv")))

total_counts_min = Matrix(DataFrame(CSV.File("data/counts.csv", header=false)))
total_counts = Matrix(DataFrame(CSV.File("data/rich_counts.csv", header=false)))
total_counts_anaero = Matrix(DataFrame(CSV.File("data/anaerobic_counts.csv", header=false)))
# total_counts = Matrix(DataFrame(CSV.File("data/counts.csv", header=false)))
total_counts_big = Matrix(DataFrame(CSV.File("data/rich_counts_big.csv", header=false)))
total_counts_big = total_counts_big[:, 2:size(total_counts_big)[2] - 1]

monomer_masses = vec(Matrix(DataFrame(CSV.File("data/monomer_masses.csv", header=false))))
monomer_areas = vec(Matrix(DataFrame(CSV.File("data/monomer_areas.csv", header=false))))
complex_areas = vec(Matrix(DataFrame(CSV.File("data/complex_areas.csv", header=false))))
protein_areas = [complex_areas; monomer_areas]
complex_ids = vec(readdlm("data/complex_ids.txt", '\t', String, '\n'))
monomer_ids = vec(readdlm("data/protein_ids.txt", '\t', String, '\n'))
cofactor_ids = vec(readdlm("data/cofactor_ids.txt", '\t', String, '\n'))
element_ids = vec(readdlm("data/element_ids.txt", '\t', String, '\n'))
pathway_ids = vec(readdlm("data/pathway_ids.txt", '\t', String, '\n'))
aa_ids = vec(readdlm("data/amino_acid_ids.txt", '\t', String, '\n'))


unwrapped_protein_names = [unwrapped_complex_names; unwrapped_monomer_names]
protein_ids = [complex_ids; monomer_ids]

C = Matrix(DataFrame(CSV.File("data/C_matrix.csv", header=false)))
P = Matrix(DataFrame(CSV.File("data/P_matrix.csv", header=false)))
E = Matrix(DataFrame(CSV.File("data/E_matrix.csv", header=false)))
W = Matrix(DataFrame(CSV.File("data/W_matrix.csv", header=false)))
W2 = Matrix(DataFrame(CSV.File("data/W2_matrix.csv", header=false)))
W1 = Matrix(DataFrame(CSV.File("data/W1_matrix.csv", header=false)))

A = Matrix(DataFrame(CSV.File("data/A_matrix.csv", header=false)))

Tree = Matrix(DataFrame(CSV.File("data/tree_matrix.csv")))

total_counts_min = Matrix(DataFrame(CSV.File("data/counts.csv", header=false)))
total_counts = Matrix(DataFrame(CSV.File("data/rich_counts.csv", header=false)))
total_counts_anaero = Matrix(DataFrame(CSV.File("data/anaerobic_counts.csv", header=false)))
# total_counts = Matrix(DataFrame(CSV.File("data/counts.csv", header=false)))
total_counts_big = Matrix(DataFrame(CSV.File("data/rich_counts_big.csv", header=false)))
total_counts_big = total_counts_big[:, 2:size(total_counts_big)[2] - 1]

monomer_masses = vec(Matrix(DataFrame(CSV.File("data/monomer_masses.csv", header=false))))
monomer_areas = vec(Matrix(DataFrame(CSV.File("data/monomer_areas.csv", header=false))))
complex_areas = vec(Matrix(DataFrame(CSV.File("data/complex_areas.csv", header=false))))
protein_areas = [complex_areas; monomer_areas]


tim_export = total_counts * C * P * E
CSV.write("data/element_sim.csv",  Tables.table(tim_export), writeheader=true, header=unwrapped_element_names)

# dismutase_idx = findfirst(isequal("SUPEROX-DISMUTFE-CPLX"), protein_ids)
# total_counts[:, dismutase_idx] .= 15000

# nadh_dehyd_idx = findfirst(isequal("NADH-DHI-CPLX"), protein_ids)
# total_counts[:, nadh_dehyd_idx, :] .= 400

# succ_quinone_idx = findfirst(isequal("CPLX0-8160"), protein_ids)
# total_counts[:, succ_quinone_idx, :] .= 1000

# temp_remove = ["EG11415-MONOMER", "EG12132-MONOMER", "CPLX0-7908",
#     "EG12332-MONOMER", "CPLX0-7617", "EG11378-MONOMER", "CPLX0-7824",
#     "G7748-MONOMER", "CPLX0-7682"]

# for item in temp_remove
#     cur_idx = findfirst(isequal(item), protein_ids)
#     total_counts[:, cur_idx] .= 0
# end

dps_idx = findfirst(isequal("EG11415-MONOMER"), monomer_ids)
P[dps_idx, :] .= 0

counts_min = vec(total_counts_min[1100, :])
counts = vec(total_counts[600, :])
counts_anaero = vec(total_counts_anaero[2500, :])

total_counts_forecast = deepcopy(total_counts_anaero)

narg_idx = findfirst(isequal("NITRATREDUCTA-CPLX"), protein_ids) #x1000
total_counts_forecast[:, narg_idx] .= 10000

form_idx = findfirst(isequal("FORMATEDEHYDROGN-CPLX"), protein_ids) # x300
total_counts_forecast[:, form_idx] .= 1000

hyc_idx = findfirst(isequal("FHLMULTI-CPLX"), protein_ids) #x50
total_counts_forecast[:, hyc_idx] .= 1000

total_counts_forecast_sox = deepcopy(total_counts)
sox_idx = findfirst(isequal("SUPEROX-DISMUTMN-CPLX"), protein_ids) #x10
total_counts_forecast_sox[:, sox_idx] .*= 10

# correction

[134]:
1301-element view(::Matrix{Int64}, :, 1049) with eltype Int64:
 179660
 179660
 179720
 179790
 179830
 179880
 179940
 180030
 180060
 180100
 180150
 180220
 180290
      ⋮
 316010
 316160
 316270
 316380
 316500
 316680
 316780
 316890
 317020
 317110
 317220
 317360
[ ]:

[135]:
function wrap_text(text::String, char_limit::Int)
    # Split the text into words
    words = split(text)
    wrapped_text = ""
    line_length = 0

    for word in words
        if line_length + length(word) > char_limit
            wrapped_text *= "\n"
            line_length = 0
        elseif wrapped_text != ""
            wrapped_text *= " "
        end
        wrapped_text *= word
        line_length += length(word) + 1  # +1 for the space
    end

    return wrapped_text
end

# Example usage:
char_limit = 30  # Insert a newline after 10 characters

# Wrap each category label
complex_names = [wrap_text(item, char_limit) for item in unwrapped_complex_names]
monomer_names = [wrap_text(item, char_limit) for item in unwrapped_monomer_names]
cofactor_names = [wrap_text(item, char_limit) for item in unwrapped_cofactor_names]
element_names = [wrap_text(item, char_limit) for item in unwrapped_element_names]
pathway_names = [wrap_text(item, char_limit) for item in unwrapped_pathway_names]
protein_names = [wrap_text(item, char_limit) for item in unwrapped_protein_names]
[135]:
5527-element Vector{String}:
 "1-phosphofructokinase"
 "2-oxoglutarate dehydrogenase\ncomplex"
 "3-isopropylmalate\ndehydrogenase"
 "3-isopropylmalate dehydratase"
 "3-methyl-2-oxobutanoate\nhydroxymethyltransferase"
 "3-oxoacyl-[acyl carrier\nprotein] synthase 2"
 "6-phosphofructokinase 1"
 "6-phosphofructokinase 2"
 "6-phosphogluconate\ndehydrogenase, decarboxylating"
 "7-α-hydroxysteroid\ndehydrogenase"
 "8-amino-7-oxononanoate\nsynthase"
 "ferric enterobactin ABC\ntransporter"
 "iron(III) hydroxamate ABC\ntransporter"
 ⋮
 "putative ABC transporter\nmembrane subunit YphD"
 "putative ABC transporter\nATP-binding protein YphE"
 "DnaA initiator-associating\nprotein DiaA"
 "intermembrane phospholipid\ntransport system, integral\nmembrane subunit MlaE"
 "intermembrane phospholipid\ntransport system, ATP binding\nsubunit MlaF"
 "putative transport protein\nYrbG"
 "galactofuranose ABC\ntransporter periplasmic\nbinding protein"
 "galactofuranose ABC\ntransporter putative ATP\nbinding subunit"
 "galactofuranose ABC\ntransporter putative membrane\nsubunit YtfT"
 "Zn<sup>2+</sup> ABC\ntransporter periplasmic\nbinding protein"
 "Zn<sup>2+</sup> ABC\ntransporter membrane subunit"
 "Zn<sup>2+</sup> ABC\ntransporter ATP binding\nsubunit"
[136]:
findfirst(isequal("NADH-DHI-CPLX"), protein_ids)
[136]:
955
[137]:
cur_idx = findfirst(isequal("PYRUVATEDEH-CPLX"), protein_ids)
counts[cur_idx]
[137]:
272
[138]:
cur_idx = findfirst(isequal("EG10889-MONOMER"), protein_ids)
counts[cur_idx]
[138]:
16466
[139]:
cur_idx = findfirst(isequal("CPLX0-3962"), protein_ids)
counts[cur_idx]
[139]:
56656

Create Sankey diagram of where the iron flows in the cell.

[140]:
cur_elements = ["FE"]
cap_entries = 40

# create counts matrices
C_P_counts = C .* repeat(counts , 1, length(monomer_names))
# C_E_counts = C_P_counts * P * E
C_O_counts = C_P_counts * P
C_E_counts = C_P_counts * P * E
cofactor_counts = vec(sum(C_O_counts, dims=1))
O_E_counts =  E .* repeat(cofactor_counts, 1, length(element_names))


# we have to normalize the array because C is not norm-preserving.
C_norm = Array{Float64}(copy(C))

for i in 1:size(C)[2]
    C_norm[i, :] = C[i, :] / (sum(C[i, :]) + 0.00001)
end

C_P_norm_counts = C .* repeat(counts, 1, length(monomer_names))
C_W_norm_counts_cats = C_P_norm_counts * W * W2

# creating element indices of interest
element_idxs = [element in cur_elements for element in element_names]
element_idxs_nz = findall(vec(element_idxs))

# create cofactor indices of interest
cofactor_idxs = vec(E[:, element_idxs] .> 0)
cofactor_idxs_nz = findall(vec(cofactor_idxs))

## cplx sorted by counts of elements
capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

capped_C_O = C_O_counts[capped_complex_idx, cofactor_idxs]
capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]
capped_O_E = O_E_counts[cofactor_idxs, element_idxs]


# corrective factor multiplies a cplx by its cofactor stoichiometry, needed for classes
corrective_factor = sum((C * P * E)[capped_complex_idx, element_idxs], dims=2)
cofactor_element_corrective_factor = vec(E[cofactor_idxs, element_idxs])
cplx_element_norm = vec(capped_C_O * vec(E[cofactor_idxs, element_idxs]))
cplx_class_norm = vec(sum(capped_C_W, dims=2))

# create "remaining cofactors"
total_cofactor_counts = vec(sum(C_O_counts[:, cofactor_idxs], dims=1))
in_use_cofactor_counts = vec(sum(capped_C_O, dims=1))
remaining_cofactor_counts = vec(total_cofactor_counts - in_use_cofactor_counts)

# create "no class cplx", just use C_O counts
unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
unclassified_cplx_counts = vec(capped_C_O * cofactor_element_corrective_factor)

# create sizes
n_cplx = length(capped_complex_idx)
n_classes = length(capped_classes)
n_cofactors = length(cofactor_idxs_nz)
n_elements = length(cur_elements)

# initialize arrays
src = Vector{Int64}()
dst = Vector{Int64}()
weights = Vector{Int64}()

element_name = element_names[element_idxs][1]
n_atoms = sum(capped_O_E)

# create labels
node_labels = ["$element_name $n_atoms atoms";
                cofactor_names[cofactor_idxs];
                protein_names[capped_complex_idx];
                pathway_names[capped_classes];
                ["$c, \n other proteins" for c  cofactor_names[cofactor_idxs]];
                "Uncategorized"
]

# create colors
colors = zeros(Int64, length(node_labels))

cofactor_colors = collect(1:(n_cofactors))
other_color = 0

colors[1] = 1
colors[2:(n_cofactors+1)] = cofactor_colors

# create sorting
## sorts elements
ordering_cofactors = Vector{Pair{Int64, Int64}}()
# sort_cofactors = sortperm(vec(sum(capped_C_O, dims=1)) .* cofactor_element_corrective_factor, rev=true)
# for i in 1:(n_cofactors-1)
#     push!(ordering_cofactors, (n_elements + sort_cofactors[i])=> (n_elements + sort_cofactors[i+1]))
# end


# order of index assignments
# elements, cplx, classes, remaining elements, no class label

# chart progress: E -> O -> C -> W
# E -> O
for i in 1:n_elements
    for j in 1:n_cofactors
        if capped_O_E[j, i] != 0

            push!(src, i)
            push!(dst, n_elements + j)
            push!(weights, capped_O_E[j, i])

            # colors[n_elements + j] = colors[i]

        end
    end
end

# E -> O, reverse order because im a dumbass
for i in 1:n_cofactors
    for j in 1:n_cplx
        if capped_C_O[j, i] != 0

            push!(src, n_elements + i)
            push!(dst, n_elements + n_cofactors + j)
            push!(weights, trunc(Int, capped_C_O[j, i] * cofactor_element_corrective_factor[i]))

            colors[n_elements + n_cofactors + j] = colors[n_elements + i]

        end
    end
end

# # C -> W
for i in 1:n_cplx
    for j in 1:n_classes
        if capped_C_W[i, j] != 0

            # corrective factor multiplies by element multiplicity to maintain flow size

            push!(src, n_elements + n_cofactors + i)
            push!(dst, n_elements + n_cofactors  + n_cplx + j)
            push!(weights, trunc(Int64, trunc(Int, capped_C_W[i, j] * cplx_element_norm[i]/cplx_class_norm[i])))  # just corrects to sum of previous

            # colors[n_elements + n_cofactors + n_cplx + j] = colors[n_elements + n_cofactors + i]
            colors[n_elements + n_cofactors + n_cplx + j] = colors[1]

        end
    end

#     if unclassified_cplx_idxs[i]
#         push!(src, n_elements + n_cofactors + i)
#         push!(dst, n_elements + n_cofactors + n_cplx + n_classes + n_elements + 1)
#         push!(weights, unclassified_cplx_counts[i])

#         colors[n_elements + n_cofactors + n_cplx + n_classes + n_elements + 1] = colors[n_elements + n_cofactors + i]
#     end

end

# O -> C remaining cnts
for i in 1:n_cofactors
    push!(src, n_elements + i)
    push!(dst, n_elements + n_cofactors + n_cplx + n_classes + i)
    push!(weights, trunc(Int, remaining_cofactor_counts[i] * cofactor_element_corrective_factor[i]))
    colors[n_elements + n_cofactors + n_cplx + n_classes + i] = colors[n_elements+i]
end

# # C -> W remaining cplxes in
for i in 1:n_cplx
    if unclassified_cplx_idxs[i]
        push!(src, n_elements + n_cofactors + i)
        push!(dst, n_elements + n_cofactors + n_cplx + n_classes + n_cofactors + 1)
        push!(weights, trunc(Int, unclassified_cplx_counts[i]))
        colors[n_elements + n_cofactors + n_cplx + n_classes + n_cofactors + 1] = colors[i]
    end
end



[141]:


sankey(src, dst, weights, compact = true, node_labels = node_labels, node_colors = cgrad(:copper, maximum(colors)+3, categorical = true, rev=true)[2:(maximum(colors)+2)][colors], edge_color = :src, size=(1600, 1200), label_position = :right, label_size = 10, force_order = ordering_cofactors, title="Iron cofactor distribution in a single cell" ) # savefig("figures/iron.svg")


[141]:

with fine grained pathways

Redoing the plotting prep

[142]:
cur_elements = ["ZN"]
cap_entries = 50

# create counts matrices
C_P_counts = C .* repeat(counts, 1, length(monomer_names))
C_E_counts = C_P_counts * P * E


# we have to normalize the array because C is not norm-preserving.
C_W = C * W * W2
C_W_norm = Array{Float64}(copy(C_W))

for i in 1:size(C_W_norm)[1]
    C_W_norm[i, :] = C_W[i, :] / (sum(C_W[i, :]) + 0.00001)
end

C_W_norm_counts_cats = C_W_norm

# creating element indices of interest
element_idxs = [element in cur_elements for element in element_names]
element_idxs_nz = findall(vec(element_idxs))

## cplx sorted by counts of elements
capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]


# corrective factor multiplies a cplx by its cofactor stoichiometry
corrective_factor = sum((C * P * E)[capped_complex_idx, element_idxs], dims=2)

# create "remaining elements"
total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))
in_use_element_counts = vec(sum(capped_C_E, dims=1))
remaining_element_counts = vec(total_element_counts - in_use_element_counts)

# create "no class cplx", just use C_E counts
unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
unclassified_cplx_counts = vec(sum(capped_C_E, dims=2))

# create sizes
n_cplx = length(capped_complex_idx)
n_classes = length(capped_classes)
n_elements = length(cur_elements)

# initialize arrays
src = Vector{Int64}()
dst = Vector{Int64}()
weights = Vector{Int64}()

test_class_count = Vector{Int64}()

# create labels
node_labels = [element_names[element_idxs];
                protein_names[capped_complex_idx];
                pathway_names[capped_classes];
                ["Remaining $c proteins" for c  element_names[element_idxs]];
               # "Uncategorized"
]

# create colors
colors = ones(Int64, length(node_labels))

element_colors = collect(1:(n_elements))
other_color = 0

colors[1:n_elements] = element_colors

# create sorting
## sorts elements
ordering_elements = Vector{Pair{Int64, Int64}}()
sort_elements = sortperm(vec(sum(capped_C_E, dims=1)), rev=true)
for i in 1:(n_elements-1)
    push!(ordering_elements, sort_elements[i]=>sort_elements[i+1])
end

# sort_cplxs = sortperm(vec(sum(capped_C_E, dims=2)), rev=true)
# for i in 1:(n_cplx-1)
#     push!(ordering_elements, n_elements+sort_cplxs[i]=>n_elements+sort_cplxs[i+1])
# end

# order of index assignments
# elements, cplx, classes, remaining elements, no class label

# chart progress: E -> C -> W
# E -> C, reverse order because im a dumbass
for i in 1:n_elements
    for j in 1:n_cplx
        if capped_C_E[j, i] != 0

            push!(src, i)
            push!(dst, n_elements + j)
            push!(weights, capped_C_E[j, i])

            colors[n_elements + j] = colors[i]

        end
    end
end

# # C -> W
for i in 1:n_cplx
    for j in 1:n_classes
        if capped_C_W[i, j] != 0

            # corrective factor multiplies by element multiplicity to maintain flow size

            push!(src, n_elements + i)
            push!(dst, n_elements + n_cplx + j)
            push!(weights, trunc(Int64, capped_C_W[i, j] * capped_C_E[i]))

            colors[n_elements + n_cplx + j] = colors[n_elements + i]

            push!(test_class_count, trunc(Int64, capped_C_W[i, j] * capped_C_E[i]))

        end
    end

    if unclassified_cplx_idxs[i]
        push!(src, n_elements + i)
        push!(dst, n_elements + n_cplx + n_classes + n_elements + 1)
        push!(weights, unclassified_cplx_counts[i])

        colors[n_elements + n_cplx + n_classes + n_elements + 1] = colors[n_elements + i]
    end

end

# E -> C remaining cnts
for i in 1:n_elements
    push!(src, i)
    push!(dst, n_elements + n_cplx + n_classes + i)
    push!(weights, remaining_element_counts[i])
    colors[n_elements + n_cplx + n_classes + i] = colors[i]
end

# # C -> P remaining cnts in displayed classes
# for i in 1:n_elements


[132]:
sankey(src, dst, weights,
        compact = true,
        node_labels = node_labels,
        node_colors = cgrad(:copper, maximum(colors)+1, categorical = true, rev=true)[colors],
        edge_color = :src,
        size=(1800, 1000),
        label_position = :right,
        label_size = 12,
        force_order = ordering_cofactors,
        title="How does the cell distribute its zinc across proteins where zinc is part of a cofactor?"
)

# savefig("figures/zinc.svg")
[132]:

Manganese

[96]:
cur_elements = ["MN"]
cap_entries = 15

# create counts matrices
C_P_counts = C .* repeat(counts_min, 1, length(monomer_names))
C_E_counts = C_P_counts * P * E


# we have to normalize the array because C is not norm-preserving.
C_W = C * W * W2
C_W_norm = Array{Float64}(copy(C_W))

for i in 1:size(C_W_norm)[1]
    C_W_norm[i, :] = C_W[i, :] / (sum(C_W[i, :]) + 0.00001)
end

C_W_norm_counts_cats = C_W_norm

# creating element indices of interest
element_idxs = [element in cur_elements for element in element_names]
element_idxs_nz = findall(vec(element_idxs))

## cplx sorted by counts of elements
capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]


# corrective factor multiplies a cplx by its cofactor stoichiometry
corrective_factor = sum((C * P * E)[capped_complex_idx, element_idxs], dims=2)

# create "remaining elements"
total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))
in_use_element_counts = vec(sum(capped_C_E, dims=1))
remaining_element_counts = vec(total_element_counts - in_use_element_counts)

# create "no class cplx", just use C_E counts
unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
unclassified_cplx_counts = vec(sum(capped_C_E, dims=2))

# create sizes
n_cplx = length(capped_complex_idx)
n_classes = length(capped_classes)
n_elements = length(cur_elements)

# initialize arrays
src = Vector{Int64}()
dst = Vector{Int64}()
weights = Vector{Int64}()

# create labels
node_labels = [element_names[element_idxs];
                protein_names[capped_complex_idx];
                pathway_names[capped_classes];
                ["Remaining $c proteins" for c  element_names[element_idxs]]]

# create colors
colors = zeros(Int64, length(node_labels))

element_colors = collect(1:(n_elements))
other_color = 0

colors[1:n_elements] = element_colors

# create sorting
## sorts elements
ordering_elements = Vector{Pair{Int64, Int64}}()
sort_elements = sortperm(vec(sum(capped_C_E, dims=1)), rev=true)
for i in 1:(n_elements-1)
    push!(ordering_elements, sort_elements[i]=>sort_elements[i+1])
end

# sort_cplxs = sortperm(vec(sum(capped_C_E, dims=2)), rev=true)
# for i in 1:(n_cplx-1)
#     push!(ordering_elements, n_elements+sort_cplxs[i]=>n_elements+sort_cplxs[i+1])
# end

# order of index assignments
# elements, cplx, classes, remaining elements, no class label

# chart progress: E -> C -> W
# E -> C, reverse order because im a dumbass
for i in 1:n_elements
    for j in 1:n_cplx
        if capped_C_E[j, i] != 0

            push!(src, i)
            push!(dst, n_elements + j)
            push!(weights, capped_C_E[j, i])

            colors[n_elements + j] = colors[i]

        end
    end
end

# # C -> W
for i in 1:n_cplx
    for j in 1:n_classes
        if capped_C_W[i, j] != 0

            # corrective factor multiplies by element multiplicity to maintain flow size

            push!(src, n_elements + i)
            push!(dst, n_elements + n_cplx + j)
            push!(weights, trunc(Int64, capped_C_W[i, j] * capped_C_E[i]))

            colors[n_elements + n_cplx + j] = colors[n_elements + i]

        end
    end

    if unclassified_cplx_idxs[i]
        push!(src, n_elements + i)
        push!(dst, n_elements + n_cplx + n_classes + n_elements + 1)
        push!(weights, unclassified_cplx_counts[i])

        colors[n_elements + n_cplx + n_classes + n_elements + 1] = colors[n_elements + i]
    end

end

# E -> C remaining cnts
for i in 1:n_elements
    push!(src, i)
    push!(dst, n_elements + n_cplx + n_classes + i)
    push!(weights, remaining_element_counts[i])
    colors[n_elements + n_cplx + n_classes + i] = colors[i]
end

# # C -> P remaining cnts in displayed classes
# for i in 1:n_elements


[97]:
sankey(src, dst, weights,
        compact = true,
        node_labels = node_labels,
        node_colors = cgrad(:copper, maximum(colors)+1, categorical = true, rev=true)[colors],
        edge_color = :src,
        size=(1800, 1000),
        label_position = :right,
        label_size = 12,
        force_order = ordering_cofactors,
        title="How does the cell distribute its manganese across proteins where manganese is part of a cofactor?"
)

# savefig("figures/manganese.svg")
[97]:
[98]:
cur_elements = ["CU"]
cap_entries = 6

# create counts matrices
C_P_counts = C .* repeat(counts, 1, length(monomer_names))
C_E_counts = C_P_counts * P * E


# we have to normalize the array because C is not norm-preserving.
C_W = C * W * W2
C_W_norm = Array{Float64}(copy(C_W))

for i in 1:size(C_W_norm)[1]
    C_W_norm[i, :] = C_W[i, :] / (sum(C_W[i, :]) + 0.00001)
end

C_W_norm_counts_cats = C_W_norm

# creating element indices of interest
element_idxs = [element in cur_elements for element in element_names]
element_idxs_nz = findall(vec(element_idxs))

## cplx sorted by counts of elements
capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]


# corrective factor multiplies a cplx by its cofactor stoichiometry
corrective_factor = sum((C * P * E)[capped_complex_idx, element_idxs], dims=2)

# create "remaining elements"
total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))
in_use_element_counts = vec(sum(capped_C_E, dims=1))
remaining_element_counts = vec(total_element_counts - in_use_element_counts)

# create "no class cplx", just use C_E counts
unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
unclassified_cplx_counts = vec(sum(capped_C_E, dims=2))

# create sizes
n_cplx = length(capped_complex_idx)
n_classes = length(capped_classes)
n_elements = length(cur_elements)

# initialize arrays
src = Vector{Int64}()
dst = Vector{Int64}()
weights = Vector{Int64}()

# create labels
node_labels = [element_names[element_idxs];
                protein_names[capped_complex_idx];
                pathway_names[capped_classes];
                ["Remaining $c proteins" for c  element_names[element_idxs]];]

# create colors
colors = zeros(Int64, length(node_labels))

element_colors = collect(1:(n_elements))
other_color = 0

colors[1:n_elements] = element_colors

# create sorting
## sorts elements
ordering_elements = Vector{Pair{Int64, Int64}}()
sort_elements = sortperm(vec(sum(capped_C_E, dims=1)), rev=true)
for i in 1:(n_elements-1)
    push!(ordering_elements, sort_elements[i]=>sort_elements[i+1])
end

# sort_cplxs = sortperm(vec(sum(capped_C_E, dims=2)), rev=true)
# for i in 1:(n_cplx-1)
#     push!(ordering_elements, n_elements+sort_cplxs[i]=>n_elements+sort_cplxs[i+1])
# end

# order of index assignments
# elements, cplx, classes, remaining elements, no class label

# chart progress: E -> C -> W
# E -> C, reverse order because im a dumbass
for i in 1:n_elements
    for j in 1:n_cplx
        if capped_C_E[j, i] != 0

            push!(src, i)
            push!(dst, n_elements + j)
            push!(weights, capped_C_E[j, i])

            colors[n_elements + j] = colors[i]

        end
    end
end

# # C -> W
for i in 1:n_cplx
    for j in 1:n_classes
        if capped_C_W[i, j] != 0

            # corrective factor multiplies by element multiplicity to maintain flow size

            push!(src, n_elements + i)
            push!(dst, n_elements + n_cplx + j)
            push!(weights, trunc(Int64, capped_C_W[i, j] * capped_C_E[i]))

            colors[n_elements + n_cplx + j] = colors[n_elements + i]

        end
    end

    if unclassified_cplx_idxs[i]
        push!(src, n_elements + i)
        push!(dst, n_elements + n_cplx + n_classes + n_elements + 1)
        push!(weights, unclassified_cplx_counts[i])

        colors[n_elements + n_cplx + n_classes + n_elements + 1] = colors[n_elements + i]
    end

end

# E -> C remaining cnts
for i in 1:n_elements
    push!(src, i)
    push!(dst, n_elements + n_cplx + n_classes + i)
    push!(weights, remaining_element_counts[i])
    colors[n_elements + n_cplx + n_classes + i] = colors[i]
end

# # C -> P remaining cnts in displayed classes
# for i in 1:n_elements


[99]:
sankey(src, dst, weights,
        compact = true,
        node_labels = node_labels,
        node_colors = cgrad(:copper, maximum(colors)+1, categorical = true, rev=true)[colors],
        edge_color = :src,
        size=(1800, 1000),
        label_position = :right,
        label_size = 12,
        force_order = ordering_cofactors,
        title="How does the cell distribute its copper across proteins where manganese is part of a cofactor?"
)

# savefig("figures/manganese.svg")
[99]:
[100]:
cur_elements = ["NI", "CU", "MO", "SE", "CO"]
cap_entries = 25

# create counts matrices
C_P_counts = C .* repeat(counts, 1, length(monomer_names))
C_E_counts = C_P_counts * P * E


# we have to normalize the array because C is not norm-preserving.
C_W = C * W * W2
C_W_norm = Array{Float64}(copy(C_W))

for i in 1:size(C_W_norm)[1]
    C_W_norm[i, :] = C_W[i, :] / (sum(C_W[i, :]) + 0.00001)
end

C_W_norm_counts_cats = C_W_norm

# creating element indices of interest
element_idxs = [element in cur_elements for element in element_names]
element_idxs_nz = findall(vec(element_idxs))

## cplx sorted by counts of elements
capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]


# corrective factor multiplies a cplx by its cofactor stoichiometry
corrective_factor = sum((C * P * E)[capped_complex_idx, element_idxs], dims=2)

# create "remaining elements"
total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))
in_use_element_counts = vec(sum(capped_C_E, dims=1))
remaining_element_counts = vec(total_element_counts - in_use_element_counts)

# create "no class cplx", just use C_E counts
unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
unclassified_cplx_counts = vec(sum(capped_C_E, dims=2))

# create sizes
n_cplx = length(capped_complex_idx)
n_classes = length(capped_classes)
n_elements = length(cur_elements)

# initialize arrays
src = Vector{Int64}()
dst = Vector{Int64}()
weights = Vector{Int64}()

# create labels
node_labels = [element_names[element_idxs];
                protein_names[capped_complex_idx];
                pathway_names[capped_classes];
                ["Remaining $c proteins" for c  element_names[element_idxs]];
                # "Uncategorized"
]

# create colors
colors = zeros(Int64, length(node_labels))

element_colors = collect(1:(n_elements))
other_color = 0

colors[1:n_elements] = element_colors

# create sorting
## sorts elements
ordering_elements = Vector{Pair{Int64, Int64}}()
sort_elements = sortperm(vec(sum(capped_C_E, dims=1)), rev=true)
for i in 1:(n_elements-1)
    push!(ordering_elements, sort_elements[i]=>sort_elements[i+1])
end

# sort_cplxs = sortperm(vec(sum(capped_C_E, dims=2)), rev=true)
# for i in 1:(n_cplx-1)
#     push!(ordering_elements, n_elements+sort_cplxs[i]=>n_elements+sort_cplxs[i+1])
# end

# order of index assignments
# elements, cplx, classes, remaining elements, no class label

# chart progress: E -> C -> W
# E -> C, reverse order because im a dumbass
for i in 1:n_elements
    for j in 1:n_cplx
        if capped_C_E[j, i] != 0

            push!(src, i)
            push!(dst, n_elements + j)
            push!(weights, capped_C_E[j, i])

            colors[n_elements + j] = colors[i]

        end
    end
end

# # C -> W
for i in 1:n_cplx
    for j in 1:n_classes
        if capped_C_W[i, j] != 0

            # corrective factor multiplies by element multiplicity to maintain flow size

            push!(src, n_elements + i)
            push!(dst, n_elements + n_cplx + j)
            push!(weights, trunc(Int64, capped_C_W[i, j] * vec(sum(capped_C_E, dims=2))[i]))

            colors[n_elements + n_cplx + j] = colors[n_elements + i]

        end
    end

    if unclassified_cplx_idxs[i]
        push!(src, n_elements + i)
        push!(dst, n_elements + n_cplx + n_classes + n_elements + 1)
        push!(weights, unclassified_cplx_counts[i])

        colors[n_elements + n_cplx + n_classes + n_elements + 1] = colors[n_elements + i]
    end

end

# # E -> C remaining cnts
# for i in 1:n_elements
#     push!(src, i)
#     push!(dst, n_elements + n_cplx + n_classes + i)
#     push!(weights, remaining_element_counts[i])
#     colors[n_elements + n_cplx + n_classes + i] = colors[i]
# end

# # C -> P remaining cnts in displayed classes
# for i in 1:n_elements


[101]:
src
[101]:
52-element Vector{Int64}:
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  3
  3
  ⋮
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 30
[102]:
sankey(src, dst, weights,
        compact = true,
        node_labels = node_labels,
        node_colors = cgrad(:copper, maximum(colors)+1, categorical = true, rev=true)[colors],
        edge_color = :src,
        size=(1800, 1000),
        label_position = :right,
        label_size = 12,
        force_order = ordering_cofactors,
        title="Rarely used, yet important  metals: copper, molybdenum, nickel, cobalt and selenium"
)

# savefig("figures/other_metals.svg")
BoundsError: attempt to access 6-element Array{RGBA{Float64},1} with eltype RGBA{Float64} at index [[1, 2, 3, 4, 5, 5, 5, 5, 1, 4  …  5, 5, 3, 5, 1, 0, 0, 0, 0, 0]]

Stacktrace:
 [1] throw_boundserror(A::Vector{RGBA{Float64}}, I::Tuple{Vector{Int64}})
   @ Base ./abstractarray.jl:744
 [2] checkbounds
   @ ./abstractarray.jl:709 [inlined]
 [3] _getindex
   @ ./multidimensional.jl:860 [inlined]
 [4] getindex(A::Vector{RGBA{Float64}}, I::Vector{Int64})
   @ Base ./abstractarray.jl:1294
 [5] getindex(acl::PlotUtils.CategoricalColorGradient, x::Vector{Int64})
   @ PlotUtils ~/.julia/packages/PlotUtils/mHQ0Q/src/colorschemes.jl:12
 [6] top-level scope
   @ In[102]:1

Create data set for Tim

[103]:
all_elements = ["FE", "ZN", "MN", "CU", "NI", "MO", "SE", "CO"]

df = DataFrame(name=["test"], element_id=["N/A"], ecocyc_id=["N/A"], plot_id=[-1], count=[-1], parent=[-1])

for q in 1:length(all_elements)
    cur_elements = [all_elements[q]]
    cap_entries = 100

    # create counts matrices
    C_P_counts = C .* repeat(counts_min, 1, length(monomer_names))
    C_E_counts = C_P_counts * P * E


    # we have to normalize the array because C is not norm-preserving.
    C_W = C * W * W2
    C_W_norm = Array{Float64}(copy(C_W))

    for i in 1:size(C_W_norm)[1]
        C_W_norm[i, :] = C_W[i, :] / (sum(C_W[i, :]) + 0.00001)
    end

    C_W_norm_counts_cats = C_W_norm

    # creating element indices of interest
    element_idxs = [element in cur_elements for element in element_names]
    element_idxs_nz = findall(vec(element_idxs))

    ## cplx sorted by counts of elements
    capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]


    capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

    capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
    capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]


    # corrective factor multiplies a cplx by its cofactor stoichiometry
    corrective_factor = sum((C * P * E)[capped_complex_idx, element_idxs], dims=2)

    # create "remaining elements"
    total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))
    in_use_element_counts = vec(sum(capped_C_E, dims=1))
    remaining_element_counts = vec(total_element_counts - in_use_element_counts)

    # create "no class cplx", just use C_E counts
    unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
    unclassified_cplx_counts = vec(sum(capped_C_E, dims=2))

    # create sizes
    n_cplx = length(capped_complex_idx)
    n_classes = length(capped_classes)
    n_elements = length(cur_elements)

    # initialize arrays
    src = Vector{Int64}()
    dst = Vector{Int64}()
    weights = Vector{Int64}()

    # create labels
    node_labels = [element_names[element_idxs];
                    protein_names[capped_complex_idx];
                    pathway_names[capped_classes];
                    ["Remaining $c proteins" for c  element_names[element_idxs]];
                    "Uncategorized"]

    # create labels
    node_ids = [element_ids[element_idxs];
                    protein_ids[capped_complex_idx];
                    pathway_ids[capped_classes];
                    ["Remaining $c proteins" for c  element_ids[element_idxs]];
                    "Uncategorized"]

    # create colors
    colors = zeros(Int64, length(node_labels))

    element_colors = collect(1:(n_elements))
    other_color = 0

    colors[1:n_elements] = element_colors

    # create sorting
    ## sorts elements
    ordering_elements = Vector{Pair{Int64, Int64}}()
    sort_elements = sortperm(vec(sum(capped_C_E, dims=1)), rev=true)
    for i in 1:(n_elements-1)
        push!(ordering_elements, sort_elements[i]=>sort_elements[i+1])
    end

    # sort_cplxs = sortperm(vec(sum(capped_C_E, dims=2)), rev=true)
    # for i in 1:(n_cplx-1)
    #     push!(ordering_elements, n_elements+sort_cplxs[i]=>n_elements+sort_cplxs[i+1])
    # end

    # order of index assignments
    # elements, cplx, classes, remaining elements, no class label

    # chart progress: E -> C -> W
    # E -> C, reverse order because im a dumbass
    for i in 1:n_elements
        for j in 1:n_cplx
            if capped_C_E[j, i] != 0

                push!(src, i)
                push!(dst, n_elements + j)
                push!(weights, capped_C_E[j, i])

                colors[n_elements + j] = colors[i]

            end
        end
    end

    # # C -> W
    for i in 1:n_cplx
        for j in 1:n_classes
            if capped_C_W[i, j] != 0

                # corrective factor multiplies by element multiplicity to maintain flow size

                push!(src, n_elements + i)
                push!(dst, n_elements + n_cplx + j)
                push!(weights, trunc(Int64, capped_C_W[i, j] * vec(sum(capped_C_E, dims=2))[i]))

                colors[n_elements + n_cplx + j] = colors[n_elements + i]

            end
        end

        if unclassified_cplx_idxs[i]
            push!(src, n_elements + i)
            push!(dst, n_elements + n_cplx + n_classes + n_elements + 1)
            push!(weights, unclassified_cplx_counts[i])

            colors[n_elements + n_cplx + n_classes + n_elements + 1] = colors[n_elements + i]
        end

    end

    # E -> C remaining cnts
    for i in 1:n_elements
        push!(src, i)
        push!(dst, n_elements + n_cplx + n_classes + i)
        push!(weights, remaining_element_counts[i])
        colors[n_elements + n_cplx + n_classes + i] = colors[i]
    end

    # # C -> P remaining cnts in displayed classes
    # for i in 1:n_elements



    class_idx = Vector{Int64}()
    class_totals = Vector{Int64}()
    class_parents = Vector{Int64}()
    class_names = Vector{String}()
    class_ids = Vector{String}()

    cplx_idx = Vector{Int64}()
    cplx_parents = Vector{Int64}()
    cplx_counts = Vector{Int64}()
    cplx_names = Vector{String}()
    cplx_ids = Vector{String}()

    for flow in 1:length(src)

        cur_src = src[flow]
        cur_dst = dst[flow]
        cur_weight = weights[flow]
        src_label = node_labels[cur_src]
        dst_label = node_labels[cur_dst]
        src_id = node_ids[cur_src]
        dst_id = node_ids[cur_dst]


        # print(cur_src," ", cur_dst," ", cur_weight, "\n")

        if cur_dst > n_elements + n_cplx && cur_dst < n_elements + n_cplx + n_classes + 1

            if cur_dst in class_idx
                cur_idx = class_idx .== cur_dst
                class_totals[cur_idx] .+= cur_weight
            else
                push!(class_idx, cur_dst)
                push!(class_totals, cur_weight)
                push!(class_names, dst_label)
                push!(class_ids, dst_id)
                push!(class_parents, 1)
            end

            push!(cplx_idx, cur_src)
            push!(cplx_parents, cur_dst)
            push!(cplx_counts, cur_weight)
            push!(cplx_names, src_label)
            push!(cplx_ids, src_id)

        end

    end

    # to differ metals, add 10000 times q.
    class_idx .+= (q*10000)
    class_parents .+= (q*10000)
    cplx_idx .+= (q*10000)
    cplx_parents .+= (q*10000)

    df_element = DataFrame(name=cur_elements[1], element_id=cur_elements[1], ecocyc_id="N/A",
        plot_id = (q*10000)+1, count=sum(total_element_counts), parent=-1)
    df_inner = DataFrame(name=class_names, element_id=cur_elements[1], ecocyc_id=class_ids, plot_id=class_idx, count=class_totals, parent=class_parents)
    df_outer = DataFrame(name=cplx_names, element_id=cur_elements[1], ecocyc_id=cplx_ids, plot_id=cplx_idx, count=cplx_counts, parent=cplx_parents)

    df = vcat(df, df_element, df_inner, df_outer)

end

[104]:
CSV.write("tim_cofactors_minimal.csv", df)
[104]:
"tim_cofactors_minimal.csv"

organic cofactors

[105]:
cur_elements = ["FAD", "FMN", "thiamine diphosphate", "pyridoxal 5'-phosphate", "biotin", "(<i>R</i>)-lipoate"]
cap_entries = 20

# create counts matrices
C_P_counts = C .* repeat(counts, 1, length(monomer_names))
C_E_counts = C_P_counts * P #  * E


# we have to normalize the array because C is not norm-preserving.
C_W = C * W * W2
C_W_norm = Array{Float64}(copy(C_W))

for i in 1:size(C_W_norm)[1]
    C_W_norm[i, :] = C_W[i, :] / (sum(C_W[i, :]) + 0.00001)
end

C_W_norm_counts_cats = C_W_norm

# creating element indices of interest
element_idxs = [element in cur_elements for element in cofactor_names]
element_idxs_nz = findall(vec(element_idxs))

## cplx sorted by counts of elements
capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]


# corrective factor multiplies a cplx by its cofactor stoichiometry
corrective_factor = sum((C * P)[capped_complex_idx, element_idxs], dims=2)

# create "remaining elements"
total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))
in_use_element_counts = vec(sum(capped_C_E, dims=1))
remaining_element_counts = vec(total_element_counts - in_use_element_counts)

# create "no class cplx", just use C_E counts
unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
unclassified_cplx_counts = vec(sum(capped_C_E, dims=2))

# create sizes
n_cplx = length(capped_complex_idx)
n_classes = length(capped_classes)
n_elements = length(cur_elements)

# initialize arrays
src = Vector{Int64}()
dst = Vector{Int64}()
weights = Vector{Int64}()

# create labels
node_labels = [cofactor_names[element_idxs];
                protein_names[capped_complex_idx];
                pathway_names[capped_classes];
                ["Remaining $c proteins" for c  cofactor_names[element_idxs]];
                "Uncategorized"]

# create colors
colors = zeros(Int64, length(node_labels))

element_colors = collect(1:(n_elements))
other_color = 0

colors[1:n_elements] = element_colors

# create sorting
## sorts elements
ordering_elements = Vector{Pair{Int64, Int64}}()
sort_elements = sortperm(vec(sum(capped_C_E, dims=1)), rev=true)
for i in 1:(n_elements-1)
    push!(ordering_elements, sort_elements[i]=>sort_elements[i+1])
end

# sort_cplxs = sortperm(vec(sum(capped_C_E, dims=2)), rev=true)
# for i in 1:(n_cplx-1)
#     push!(ordering_elements, n_elements+sort_cplxs[i]=>n_elements+sort_cplxs[i+1])
# end

# order of index assignments
# elements, cplx, classes, remaining elements, no class label

# chart progress: E -> C -> W
# E -> C, reverse order because im a dumbass
for i in 1:n_elements
    for j in 1:n_cplx
        if capped_C_E[j, i] != 0

            push!(src, i)
            push!(dst, n_elements + j)
            push!(weights, capped_C_E[j, i])

            colors[n_elements + j] = colors[i]

        end
    end
end

# # C -> W
for i in 1:n_cplx
    for j in 1:n_classes
        if capped_C_W[i, j] != 0

            # corrective factor multiplies by element multiplicity to maintain flow size

            push!(src, n_elements + i)
            push!(dst, n_elements + n_cplx + j)
            push!(weights, trunc(Int64, capped_C_W[i, j] * vec(sum(capped_C_E, dims=2))[i]))

            colors[n_elements + n_cplx + j] = colors[n_elements + i]

        end
    end

    if unclassified_cplx_idxs[i]
        push!(src, n_elements + i)
        push!(dst, n_elements + n_cplx + n_classes + n_elements + 1)
        push!(weights, unclassified_cplx_counts[i])

        colors[n_elements + n_cplx + n_classes + n_elements + 1] = colors[n_elements + i]
    end

end

# E -> C remaining cnts
for i in 1:n_elements
    push!(src, i)
    push!(dst, n_elements + n_cplx + n_classes + i)
    push!(weights, remaining_element_counts[i])
    colors[n_elements + n_cplx + n_classes + i] = colors[i]
end

# # C -> P remaining cnts in displayed classes
# for i in 1:n_elements


[106]:
sankey(src, dst, weights,
        compact = true,
        node_labels = node_labels,
        node_colors = cgrad(:copper, maximum(colors)+1, categorical = true, rev=true)[colors],
        edge_color = :src,
        size=(1800, 1000),
        label_position = :right,
        label_size = 12,
        force_order = ordering_elements,
        title="How does the cell distribute it's precious iron across proteins where iron is part of a cofactor?"
)

# savefig("figures/organic.svg")
[106]:

Histograms

Specialist vs generalist, CDF

[35]:
top_utilizers_per_metal = Vector{Vector{Float64}}()

elements = ["FE", "ZN", "MN", "MO","CU", "SE", "NI", "CO"]

for element in elements
    cur_elements = [element]
    cap_entries = 250

    # create counts matrices
    C_P_counts = C .* repeat(counts, 1, length(monomer_names))
    C_E_counts = C_P_counts * P * E

    # creating element indices of interest
    element_idxs = [element in cur_elements for element in element_names]
    element_idxs_nz = findall(vec(element_idxs))

    ## cplx sorted by counts of elements
    capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
    capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

    capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
    capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]

    total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))


    push!(top_utilizers_per_metal, vec(capped_C_E ./ total_element_counts))

end


# to matrix
top_utilizers_per_metal = mapreduce(permutedims, vcat, top_utilizers_per_metal)



[35]:
8×250 Matrix{Float64}:
 0.0829005  0.0447835  0.0446907  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.158264   0.114675   0.0655838     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.668145   0.118116   0.0700461     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.534487   0.295241   0.0397048     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.562779   0.2047     0.204117      0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.807692   0.192308   0.0        …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.668326   0.0721842  0.0678283     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.982026   0.0179739  0.0           0.0  0.0  0.0  0.0  0.0  0.0  0.0
[36]:
cdf_df = DataFrame(transpose(cumsum(top_utilizers_per_metal, dims=2)), elements)
cdf_df[!, "Protein_n"] = 1:size(cdf_df)[1]
cdf_df

cdf_df = stack(cdf_df, variable_name="Metal", value_name="Cumulative_fraction")
[36]:
2000×3 DataFrame
1975 rows omitted
RowProtein_nMetalCumulative_fraction
Int64StringFloat64
11FE0.0829005
22FE0.127684
33FE0.172375
44FE0.213015
55FE0.252372
66FE0.288472
77FE0.322877
88FE0.351741
99FE0.378027
1010FE0.40248
1111FE0.426327
1212FE0.448166
1313FE0.469803
1989239CO1.0
1990240CO1.0
1991241CO1.0
1992242CO1.0
1993243CO1.0
1994244CO1.0
1995245CO1.0
1996246CO1.0
1997247CO1.0
1998248CO1.0
1999249CO1.0
2000250CO1.0
[37]:
@df cdf_df plot(:Protein_n, :Cumulative_fraction, group=:Metal, width=5, xlabel="Protein n, sorted", ylabel="Cumulative fraction of metal capacity")
[37]:
[38]:
@df cdf_df plot(:Protein_n, :Cumulative_fraction, group=:Metal, width=5, xlabel="Protein n, sorted", ylabel="Cumulative fraction of metal capacity",
xlims=(1, 20))
[38]:

Avg. protein expression for different metals

[39]:
C_P_counts = C .* repeat(counts, 1, length(monomer_names))
C_E_counts = C_P_counts * P * E
[39]:
5527×19 Matrix{Int64}:
 0  0    0  0     0  0  0  0  0  0      0  …    0     0      0     0      0
 0  0  740  0     0  0  0  0  0  0      0       0  3330  14430     0  45510
 0  0    0  0     0  0  0  0  0  0      0       0     0      0  5390      0
 0  0    0  0     0  0  0  0  0  0  21228       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0  5580      0
 0  0    0  0     0  0  0  0  0  0      0  …    0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0  5008      0
 0  0    0  0     0  0  0  0  0  0      0     270     0      0   270      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0  362  0     0  0  0  0  0  0      0  …    0   362   2172     0   2896
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 ⋮                   ⋮                  ⋮  ⋱          ⋮
 0  0    0  0     0  0  0  0  0  0      0  …    0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0  …    0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0  1247  0  0  0  0  0      0       0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0  …    0     0      0     0      0
 0  0    0  0     0  0  0  0  0  0      0       0     0      0     0      0
[40]:
cur_elements = ["FE", "ZN", "MO", "NI", "SE"]
fraction_expressed = Vector{Float64}()

for element in cur_elements

    cplx_with_element = findall(C * P * vec(E[:, element_ids .== element]) .!= 0)

    push!(fraction_expressed, sum(counts[cplx_with_element] .>= 100) / length(counts[cplx_with_element]))

end

push!(cur_elements, "Random")
push!(fraction_expressed, sum(counts .>= 100) / length(counts))
[40]:
6-element Vector{Float64}:
 0.32575757575757575
 0.44813278008298757
 0.10714285714285714
 0.13333333333333333
 0.0
 0.4145105844038357
[41]:
fraction_order = sortperm(fraction_expressed)

bar(fraction_expressed[fraction_order], xticks=(1:length(cur_elements), cur_elements[fraction_order]), title="Fraction of proteins with count>100 in standard rich")
[41]:

Combined

[46]:
cur_elements = ["FE", "ZN", "MN", "CU", "MO", "NI", "CO"]

val_data = [vec([180000 200000 5000 10000 1000 0 0]) vec([25000 25000 5000 10000 1000 1000 1000]) cur_elements ["Minimal" for i in 1:7];
vec([600000 400000 50000 70000 5000 0 0]) vec([100000 50000 10000 20000 1000 1000 1000]) cur_elements ["Rich" for i in 1:7]]

val_df = DataFrame(val_data, ["Counts", "Sd", "Element", "Media"])
[46]:
14×4 DataFrame
RowCountsSdElementMedia
AnyAnyAnyAny
118000025000FEMinimal
220000025000ZNMinimal
350005000MNMinimal
41000010000CUMinimal
510001000MOMinimal
601000NIMinimal
701000COMinimal
8600000100000FERich
940000050000ZNRich
105000010000MNRich
117000020000CURich
1250001000MORich
1301000NIRich
1401000CORich
[47]:
element_timeseries = total_counts * C * P * E

cur_elements = ["FE", "ZN", "MN", "CU", "MO", "NI", "CO"]

element_idx = [findfirst(==(element), element_names) for element in cur_elements]

current_series = element_timeseries[:,element_idx]

averages = sum(current_series, dims=1) / size(current_series)[1]

series_df = DataFrame(current_series, cur_elements)
stack_series_df =stack(series_df, 1:length(cur_elements), variable_name="Element", value_name="Counts")
stack_series_df[!, "Media"] = ["Rich" for i in 1:size(stack_series_df)[1]]
stack_series_df[!, "Volume"] = [4 for i in 1:size(stack_series_df)[1]]



element_timeseries = total_counts_min * C * P * E

current_series = element_timeseries[:,element_idx]

series_df = DataFrame(current_series, cur_elements)
stack_series_df_2 =stack(series_df, 1:length(cur_elements), variable_name="Element", value_name="Counts")
stack_series_df_2[!, "Media"] = ["Minimal" for i in 1:size(stack_series_df_2)[1]]
stack_series_df_2[!, "Volume"] = [2 for i in 1:size(stack_series_df_2)[1]]


element_timeseries = total_counts_anaero * C * P * E

current_series = element_timeseries[:,element_idx]

series_df = DataFrame(current_series, cur_elements)
stack_series_df_3 =stack(series_df, 1:length(cur_elements), variable_name="Element", value_name="Counts")
stack_series_df_3[!, "Media"] = ["Anaerobic" for i in 1:size(stack_series_df_3)[1]]
stack_series_df_3[!, "Volume"] = [1 for i in 1:size(stack_series_df_3)[1]]



element_timeseries = total_counts_forecast * C * P * E

current_series = element_timeseries[:,element_idx]

series_df = DataFrame(current_series, cur_elements)
stack_series_df_4 =stack(series_df, 1:length(cur_elements), variable_name="Element", value_name="Counts")
stack_series_df_4[!, "Media"] = ["Anaerobic + NO3" for i in 1:size(stack_series_df_4)[1]]
stack_series_df_4[!, "Volume"] = [1 for i in 1:size(stack_series_df_4)[1]]

element_timeseries = total_counts_forecast_sox * C * P * E
current_series = element_timeseries[:,element_idx]

series_df = DataFrame(current_series, cur_elements)
stack_series_df_5 =stack(series_df, 1:length(cur_elements), variable_name="Element", value_name="Counts")
stack_series_df_5[!, "Media"] = ["Rich + sox" for i in 1:size(stack_series_df_5)[1]]
stack_series_df_5[!, "Volume"] = [4 for i in 1:size(stack_series_df_5)[1]]


total_df = vcat(stack_series_df, stack_series_df_2, stack_series_df_3, stack_series_df_4, stack_series_df_5)
total_df[!, "Concentration"] = total_df[!, "Counts"] ./ total_df[!, "Volume"]
[47]:
112735-element Vector{Float64}:
 93673.0
 93702.0
 93729.0
 93751.0
 93795.75
 93824.0
 93857.5
 93904.75
 93941.0
 93968.5
 94014.5
 94048.75
 94088.5
     ⋮
   344.0
   344.0
   344.0
   344.0
   344.0
   344.0
   344.0
   344.0
   344.0
   344.5
   344.5
   344.5
[48]:
@df total_df groupedboxplot(:Element, :Counts, group=:Media,
    title="Simulation counts across conditions", xlabel="Element", ylabel="Counts per cell",
    yaxis=:log,
    ylims=[100,1000000]
)
# @df val_df dotplot!(:Element, :Counts, group=:Media,  markerstrokewidth = 2, label="Validation")
# @df stack_val_df scatter!(:metal, :count, yerror=[25000, 25000, 5000, 10000], markerstrokewidth = 2, label="Validation")
# , label="Simulation", title="Minimal media (low Mn)", xlabel="Element", ylabel="Counts per cell"
[48]:
[49]:
@df total_df groupedboxplot(:Element, :Concentration, group=:Media,
    title="Simulation concentrations across conditions", xlabel="Element", ylabel="Concentration per cell",
    yaxis=:log,
    ylims=[100,1000000]
)
# @df val_df dotplot!(:Element, :Counts, group=:Media,  markerstrokewidth = 2, label="Validation")
# @df stack_val_df scatter!(:metal, :count, yerror=[25000, 25000, 5000, 10000], markerstrokewidth = 2, label="Validation")
# , label="Simulation", title="Minimal media (low Mn)", xlabel="Element", ylabel="Counts per cell"
[49]:
[50]:
@df val_df scatter(:Element, :Counts, group=:Media, yerror=:Sd, markerstrokewidth = 2)
[50]:

Methionine sketch

[52]:
data = [2000 280000;
        400000 700000;
        180000 16000;
        2000 80000;
        2000 80000]

ctg = repeat(["+met", "-met"], inner = 5)
nam = repeat(["metE counts", "zinc atoms per cell", "rpmE counts", "ykgM counts", "ykgO counts"], outer = 2)
[52]:
10-element Vector{String}:
 "metE counts"
 "zinc atoms per cell"
 "rpmE counts"
 "ykgM counts"
 "ykgO counts"
 "metE counts"
 "zinc atoms per cell"
 "rpmE counts"
 "ykgM counts"
 "ykgO counts"
[53]:
groupedbar(nam, data, group=ctg)
[53]:
[54]:
l = @layout [grid(2,2)]

plot(
    transpose(data),
    layout = l, legend = false, seriestype = [:bar],
    title = ["($i)" for j in 1:1, i in 1:11], titleloc = :right, titlefont = font(8)
)
[54]:

Organic

[55]:
element_timeseries = total_counts * C * P

cur_elements = ["FAD", "FMN", "thiamine diphosphate", "pyridoxal 5'-phosphate", "biotin", "(<i>R</i>)-lipoate"]

element_idx = [element in cur_elements for element in cofactor_names]

sorted_idxs = sortperm(element_timeseries[1,element_idx], rev=true)

plot(bar(cofactor_names[element_idx][sorted_idxs], element_timeseries[1,element_idx][sorted_idxs]), legend = false, size=[1000,600],
    title="How many functional organic cofactors are sequestered in proteins (sim)", xlabel="Cofactor", ylabel="Counts per cell")

# savefig("figures/organic_counts.svg")
[55]:

Premium Amino Acids (tm)

[56]:
aa_ids
[56]:
22-element Vector{String}:
 "TYR"
 "VAL"
 "TER"
 "HIS"
 "PRO"
 "TRP"
 "GLN"
 "GLU"
 "PHE"
 "SEL"
 "GLY"
 "THR"
 "ALA"
 "ILE"
 "ASP"
 "ARG"
 "SER"
 "LEU"
 "LYS"
 "ASN"
 "MET"
 "CYS"
[57]:
cur_elements = ["TRP"]
cap_entries = 50

# create counts matrices
C_P_counts = C .* repeat(counts, 1, length(monomer_names))
C_E_counts = C_P_counts * A #  * E


# we have to normalize the array because C is not norm-preserving.
C_W = C * W * W2
C_W_norm = Array{Float64}(copy(C_W))

for i in 1:size(C_W_norm)[1]
    C_W_norm[i, :] = C_W[i, :] / (sum(C_W[i, :]) + 0.00001)
end

C_W_norm_counts_cats = C_W_norm

# creating element indices of interest
element_idxs = [element in cur_elements for element in aa_ids]
element_idxs_nz = findall(vec(element_idxs))

## cplx sorted by counts of elements
capped_complex_idx = sortperm(vec(sum(C_E_counts[:, element_idxs], dims=2)), rev=true)[1:cap_entries]
capped_classes = findall(vec(sum(C_W_norm_counts_cats[capped_complex_idx, :],dims=1) .!= 0))

capped_C_E = C_E_counts[capped_complex_idx, element_idxs]
capped_C_W = C_W_norm_counts_cats[capped_complex_idx, capped_classes]


# corrective factor multiplies a cplx by its cofactor stoichiometry
corrective_factor = sum((C * A)[capped_complex_idx, element_idxs], dims=2)

# create "remaining elements"
total_element_counts = vec(sum(C_E_counts[:, element_idxs], dims=1))
in_use_element_counts = vec(sum(capped_C_E, dims=1))
remaining_element_counts = vec(total_element_counts - in_use_element_counts)

# create "no class cplx", just use C_E counts
unclassified_cplx_idxs = vec(sum(capped_C_W, dims=2) .== 0)
unclassified_cplx_counts = vec(sum(capped_C_E, dims=2))

# create sizes
n_cplx = length(capped_complex_idx)
n_classes = length(capped_classes)
n_elements = length(cur_elements)

# initialize arrays
src = Vector{Int64}()
dst = Vector{Int64}()
weights = Vector{Int64}()

# create labels
node_labels = [aa_ids[element_idxs];
                unwrapped_protein_names[capped_complex_idx];
                unwrapped_pathway_names[capped_classes];
                ["Remaining $c proteins" for c  aa_ids[element_idxs]];
                "Uncategorized"]

# create colors
colors = zeros(Int64, length(node_labels))

element_colors = collect(1:(n_elements))
other_color = 0

colors[1:n_elements] = element_colors

# create sorting
## sorts elements
ordering_elements = Vector{Pair{Int64, Int64}}()
sort_elements = sortperm(vec(sum(capped_C_E, dims=1)), rev=true)
for i in 1:(n_elements-1)
    push!(ordering_elements, sort_elements[i]=>sort_elements[i+1])
end

# sort_cplxs = sortperm(vec(sum(capped_C_E, dims=2)), rev=true)
# for i in 1:(n_cplx-1)
#     push!(ordering_elements, n_elements+sort_cplxs[i]=>n_elements+sort_cplxs[i+1])
# end

# order of index assignments
# elements, cplx, classes, remaining elements, no class label

# chart progress: E -> C -> W
# E -> C, reverse order because im a dumbass
for i in 1:n_elements
    for j in 1:n_cplx
        if capped_C_E[j, i] != 0

            push!(src, i)
            push!(dst, n_elements + j)
            push!(weights, capped_C_E[j, i])

            colors[n_elements + j] = colors[i]

        end
    end
end

# # C -> W
for i in 1:n_cplx
    for j in 1:n_classes
        if capped_C_W[i, j] != 0

            # corrective factor multiplies by element multiplicity to maintain flow size

            push!(src, n_elements + i)
            push!(dst, n_elements + n_cplx + j)
            push!(weights, trunc(Int64, capped_C_W[i, j] * vec(sum(capped_C_E, dims=2))[i]))

            colors[n_elements + n_cplx + j] = colors[n_elements + i]

        end
    end

    if unclassified_cplx_idxs[i]
        push!(src, n_elements + i)
        push!(dst, n_elements + n_cplx + n_classes + n_elements + 1)
        push!(weights, unclassified_cplx_counts[i])

        colors[n_elements + n_cplx + n_classes + n_elements + 1] = colors[n_elements + i]
    end

end

# E -> C remaining cnts
for i in 1:n_elements
    push!(src, i)
    push!(dst, n_elements + n_cplx + n_classes + i)
    push!(weights, remaining_element_counts[i])
    colors[n_elements + n_cplx + n_classes + i] = colors[i]
end

# # C -> P remaining cnts in displayed classes
# for i in 1:n_elements


[58]:
sankey(src, dst, weights,
        compact = true,
        node_labels = node_labels,
        node_colors = cgrad(:copper, maximum(colors)+1, categorical = true, rev=true)[colors],
        edge_color = :src,
        size=(1800, 1000),
        label_position = :right,
        label_size = 12,
        force_order = ordering_elements,
        title="How does the cell distribute it's precious TRP across proteins where TRP is part of a cofactor?"
)

# savefig("figures/general_ressource_aa.svg")
[58]:

Proteome mass fraction cofactor pathways

[59]:
W_comb = W * W2
W_norm = Array{Float64}(copy(W_comb))

for i in 1:size(W_norm)[1]
    W_norm[i, :] = W_norm[i, :] / (sum(W_norm[i, :]) + 0.00001)
end
[60]:
top_protein_masses = transpose(Diagonal(monomer_masses)) * transpose(C) * vec(counts_min)
top_protein_masses = Array{Float64}(top_protein_masses)
top_protein_masses = top_protein_masses ./ (10^(10))
top_protein_masses = top_protein_masses ./ (6.023 * 10^(13))

top_L2_rank = sortperm(top_protein_masses, rev=true)
total_protein_mass = sum(top_protein_masses)

[monomer_names[top_L2_rank] top_protein_masses[top_L2_rank]][1:40, :]
[60]:
40×2 Matrix{Any}:
 "HOMOCYSMET-MONOMER"                                           6.08977e-15
 "translation elongation factor\nTu 1"                          4.58126e-15
 "outer membrane protein A"                                     4.30041e-15
 "glyceraldehyde-3-phosphate\ndehydrogenase A"                  4.21251e-15
 "outer membrane porin F"                                       4.11633e-15
 "murein lipoprotein"                                           3.4308e-15
 "30S ribosomal subunit protein\nS1"                            3.15456e-15
 "DLP12 prophage; omptin family\nouter membrane protease OmpT"  2.94134e-15
 "isocitrate dehydrogenase"                                     2.72041e-15
 "50S ribosomal subunit protein\nL12"                           2.68308e-15
 "elongation factor G"                                          2.36134e-15
 "ketol-acid reductoisomerase\n(NADP<sup>+</sup>)"              2.29257e-15
 "alkyl hydroperoxide reductase,\nAhpC component"               2.25171e-15
 ⋮
 "enolase"                                                      1.2948e-15
 "30S ribosomal subunit protein\nS7"                            1.24429e-15
 "RNA polymerase subunit &beta;"                                1.20506e-15
 "RNA polymerase subunit &beta;'"                               1.18131e-15
 "PQQ-like domain-containing\nprotein YncE"                     1.15228e-15
 "30S ribosomal subunit protein\nS5"                            1.14751e-15
 "trigger factor"                                               1.14387e-15
 "50S ribosomal subunit protein\nL16"                           1.14372e-15
 "cysteine synthase A"                                          1.1427e-15
 "30S ribosomal subunit protein\nS4"                            1.11724e-15
 "iron catecholate outer\nmembrane transporter Fiu"             1.07793e-15
 "30S ribosomal subunit protein\nS21"                           1.07177e-15
[61]:
[monomer_names[top_L2_rank] top_protein_masses[top_L2_rank]/total_protein_mass][1:40, :]
[61]:
40×2 Matrix{Any}:
 "HOMOCYSMET-MONOMER"                                           0.0260675
 "translation elongation factor\nTu 1"                          0.0196103
 "outer membrane protein A"                                     0.0184081
 "glyceraldehyde-3-phosphate\ndehydrogenase A"                  0.0180318
 "outer membrane porin F"                                       0.0176201
 "murein lipoprotein"                                           0.0146857
 "30S ribosomal subunit protein\nS1"                            0.0135032
 "DLP12 prophage; omptin family\nouter membrane protease OmpT"  0.0125905
 "isocitrate dehydrogenase"                                     0.0116448
 "50S ribosomal subunit protein\nL12"                           0.011485
 "elongation factor G"                                          0.0101078
 "ketol-acid reductoisomerase\n(NADP<sup>+</sup>)"              0.00981344
 "alkyl hydroperoxide reductase,\nAhpC component"               0.00963851
 ⋮
 "enolase"                                                      0.00554244
 "30S ribosomal subunit protein\nS7"                            0.00532622
 "RNA polymerase subunit &beta;"                                0.00515831
 "RNA polymerase subunit &beta;'"                               0.00505663
 "PQQ-like domain-containing\nprotein YncE"                     0.00493235
 "30S ribosomal subunit protein\nS5"                            0.00491195
 "trigger factor"                                               0.00489637
 "50S ribosomal subunit protein\nL16"                           0.00489572
 "cysteine synthase A"                                          0.00489136
 "30S ribosomal subunit protein\nS4"                            0.00478237
 "iron catecholate outer\nmembrane transporter Fiu"             0.00461413
 "30S ribosomal subunit protein\nS21"                           0.00458775
[62]:
top_L2_classes_protein_masses = (transpose(W_norm) * transpose(Diagonal(monomer_masses)) * transpose(C) * vec(counts_min))
top_L2_classes_protein_masses = Array{Float64}(top_L2_classes_protein_masses)
top_L2_classes_protein_masses = top_L2_classes_protein_masses ./ (10^(10))
top_L2_classes_protein_masses = top_L2_classes_protein_masses ./ (6.023 * 10^(13))

top_L2_rank = sortperm(top_L2_classes_protein_masses, rev=true)

mass_relative = top_L2_classes_protein_masses ./ sum(top_L2_classes_protein_masses)
[pathway_names[top_L2_rank] mass_relative[top_L2_rank]][1:20, :]
[62]:
20×2 Matrix{Any}:
 "Translation"                                   0.260538
 "Uncategorized"                                 0.209889
 "Amino Acid Biosynthesis"                       0.0760157
 "Cofactor, Carrier, and Vitamin\nBiosynthesis"  0.048382
 "Porin"                                         0.0474022
 "Glycolysis"                                    0.0387279
 "Regulation of transcription"                   0.0306598
 "Nucleoside and Nucleotide\nBiosynthesis"       0.0267107
 "TCA cycle"                                     0.0236993
 "Carbohydrate Biosynthesis"                     0.0210335
 "Proteolysis"                                   0.0202664
 "Transcription"                                 0.0197652
 "Aminoacyl-tRNA Charging"                       0.0195172
 "Protein folding"                               0.017896
 "Fatty Acid and Lipid\nBiosynthesis"            0.0162227
 "Electron Transfer Chains"                      0.0144624
 "Redox homeostasis"                             0.0136763
 "Fermentation"                                  0.00993962
 "Pentose Phosphate Pathways"                    0.00942627
 "Cell division"                                 0.00787791
[63]:
plot(bar(pathway_names[top_L2_rank][1:10], mass_relative[top_L2_rank][1:10]),
    legend = false, size=[1000,500],
    rotation=45,
    title="Protein mass fraction allocation to major cell pathways and functions",
    xlabel="Pathway/function", ylabel="Fraction of proteome",
    bottom_margin=25Plots.mm, left_margin=5Plots.mm)


# savefig("figures/protein_mass_dist_class.svg")
[63]:

Compare with Fe

[64]:
cur_elements = ["FE"]

element_idxs = [element in cur_elements for element in element_names]

C_E = C * P * E
c_e = vec(C_E[:, element_idxs])
nz_e_idxs = findall(c_e .> 0)

[c_e[nz_e_idxs] protein_names[nz_e_idxs]]

c_e_counts = counts[nz_e_idxs] .* c_e[nz_e_idxs]

# other counts already took care of complex size, magnitude, etc.
C_W = C * W * W2
C_W_normed = C_W ./ (sum(C_W, dims=2) .+ 0.0000001)

E_W_capped = C_W_normed[nz_e_idxs, :] .* c_e_counts

e_w = vec(sum(E_W_capped, dims=1))
e_w_relative = e_w ./ sum(e_w)

top_e_w_rank = sortperm(e_w, rev=true)

[pathway_names[top_e_w_rank] e_w[top_e_w_rank]][1:40, :]
[64]:
40×2 Matrix{Any}:
 "Electron Transfer Chains"                                         …       1.16422e5
 "Uncategorized"                                                       102126.0
 "Cofactor, Carrier, and Vitamin\nBiosynthesis"                         78036.1
 "Amino Acid Biosynthesis"                                              38109.7
 "TCA cycle"                                                            34794.0
 "Regulation of transcription"                                      …   33478.0
 "Redox homeostasis"                                                    18712.0
 "Secondary Metabolite\nBiosynthesis"                                   11672.0
 "Fermentation"                                                          9809.0
 "Nucleoside and Nucleotide\nBiosynthesis"                               7926.0
 "DNA repair"                                                       …    3683.0
 "Hydrogen Production"                                                   2744.0
 "Translation"                                                           2348.0
 ⋮                                                                  ⋱
 "pyrimidine\ndeoxyribonucleotides <i>de\nnovo</i> biosynthesis I"          0.0
 "N-end rule pathway I\n(prokaryotic)"                                      0.0
 "pyrimidine deoxyribonucleotide\nphosphorylation"                  …       0.0
 "tRNA charging"                                                            0.0
 "Generation of Precursor\nMetabolites and Energy"                          0.0
 "Pentose Phosphate Pathways"                                               0.0
 "pentose phosphate pathway"                                                0.0
 "pentose phosphate pathway\n(non-oxidative branch) I"              …       0.0
 "pentose phosphate pathway\n(oxidative branch) I"                          0.0
 "Fermentation to Short-Chain\nFatty Acids"                                 0.0
 "Fermentation to Butanoate"                                                0.0
 "Fermentation to Lactate"                                                  0.0
[65]:
plot(bar(pathway_names[top_e_w_rank][1:10], e_w_relative[top_e_w_rank][1:10]),
    legend = false, size=[1000,500],
    rotation=45,
    title="Iron mass allocation to major cell pathways and functions",
    xlabel="Pathway/function", ylabel="Fraction of cell iron",
    bottom_margin=25Plots.mm, left_margin=5Plots.mm)

# savefig("figures/iron_mass_dist_class.svg")
[65]:
[66]:
cur_elements = ["ZN"]

element_idxs = [element in cur_elements for element in element_names]

C_E = C * P * E
c_e = vec(C_E[:, element_idxs])
nz_e_idxs = findall(c_e .> 0)


c_e_counts = counts[nz_e_idxs] .* c_e[nz_e_idxs]

# other counts already took care of complex size, magnitude, etc.
C_W = C * W * W2
C_W_normed = C_W ./ (sum(C_W, dims=2) .+ 0.0000001)

E_W_capped = C_W_normed[nz_e_idxs, :] .* c_e_counts

e_w = vec(sum(E_W_capped, dims=1))
e_w_relative = e_w ./ sum(e_w)

top_e_w_rank = sortperm(e_w, rev=true)

[pathway_names[top_e_w_rank] e_w[top_e_w_rank]][1:40, :]
[66]:
40×2 Matrix{Any}:
 "Translation"                                                       74791.0
 "Uncategorized"                                                     40186.7
 "Glycolysis"                                                        27490.8
 "Transcription"                                                     23486.0
 "Aminoacyl-tRNA Charging"                                           23185.0
 "Nucleoside and Nucleotide\nBiosynthesis"                           22670.0
 "Cofactor, Carrier, and Vitamin\nBiosynthesis"                      20721.3
 "Regulation of transcription"                                       20124.5
 "Carbohydrate Biosynthesis"                                         19707.7
 "Alcohol Degradation"                                               15022.0
 "Polymeric Compound Degradation"                                    12485.0
 "Amino Acid Biosynthesis"                                           11476.7
 "Proteolysis"                                                        9500.49
 ⋮
 "Reactive Oxygen Species\nDegradation"                                264.0
 "Carboxylic Acid Biosynthesis"                                        167.0
 "Protein Modification"                                                163.0
 "Carboxylic Acid Degradation"                                         126.0
 "Carbohydrate Degradation"                                             81.0
 "Inorganic Nutrient Metabolism"                                        75.0
 "Transport"                                                             0.0
 "Metabolic Clusters"                                                    0.0
 "<i>O</i>-antigen building\nblocks biosynthesis (<i>E.\ncoli</i>)"      0.0
 "pyrimidine\ndeoxyribonucleotides\ndephosphorylation"                   0.0
 "pyrimidine\ndeoxyribonucleotides <i>de\nnovo</i> biosynthesis I"       0.0
 "N-end rule pathway I\n(prokaryotic)"                                   0.0
[67]:
plot(bar(pathway_names[top_e_w_rank][1:10], e_w_relative[top_e_w_rank][1:10]),
    legend = false, size=[1000,500],
    rotation=45,
    title="Zinc mass allocation to major cell pathways and functions",
    xlabel="Pathway/function", ylabel="Fraction of cell zinc",
    bottom_margin=25Plots.mm, left_margin=5Plots.mm)

savefig("figures/zinc_mass_dist_class.svg")
[67]:
"/Users/cyrus/vivarium-ecoli/notebooks/cofactors/figures/zinc_mass_dist_class.svg"
[68]:
minor_elements = ["CU", "MN"]
plot_array = []

for element in minor_elements
    cur_elements = [element]

    element_idxs = [element in cur_elements for element in element_names]

    C_E = C * P * E
    c_e = vec(C_E[:, element_idxs])
    nz_e_idxs = findall(c_e .> 0)


    c_e_counts = counts[nz_e_idxs] .* c_e[nz_e_idxs]

    # other counts already took care of complex size, magnitude, etc.
    C_W = C * W * W2
    C_W_normed = C_W ./ (sum(C_W, dims=2) .+ 0.0000001)

    E_W_capped = C_W_normed[nz_e_idxs, :] .* c_e_counts

    e_w = vec(sum(E_W_capped, dims=1))
    e_w_relative = e_w ./ sum(e_w)

    top_e_w_rank = sortperm(e_w, rev=true)

    [pathway_names[top_e_w_rank] e_w[top_e_w_rank]][1:40, :]


    p = plot(bar(pathway_names[top_e_w_rank][1:4], e_w_relative[top_e_w_rank][1:4]),
        legend = false, size=[800,400],
        rotation=45,
        title="$element allocation in cell",
        xlabel="Pathway/function", ylabel="Fraction of cell $element",
        bottom_margin=30Plots.mm, left_margin=5Plots.mm)

    push!(plot_array, p)

end

plot(plot_array..., layout=(1, 2))

# savefig("figures/cu_mn_dist_class.svg")
[68]:

metal ion per mass ratio

[69]:
cur_elements = ["FE", "ZN", "MN", "CA", "NI", "CU", "MO", "SE", "CO"]

element_idxs = [element in cur_elements for element in element_names]

# first, get mass per top 20 class
# normalized class belonging
W_comb = W * W2
W_norm = Array{Float64}(copy(W_comb))

for i in 1:size(W_norm)[1]
    W_norm[i, :] = W_norm[i, :] / (sum(W_norm[i, :]) + 0.00001)
end

# first, for cplxes without cofactor
mass_per_class = (transpose(W_norm) * transpose(Diagonal(monomer_masses)) * transpose(C) * vec(counts))
mass_per_class = Array{Float64}(mass_per_class)
mass_per_class = mass_per_class ./ (10^(10))
mass_per_class = mass_per_class ./ (6.023 * 10^(13))

top_mass_class = sortperm(mass_per_class, rev=true)
top_mass_filter = top_mass_class[1:15]

# now, get counts of transition metal ions per class
c_e_summed = vec(sum((C * P * E)[:, element_idxs], dims=2))
nz_e_idxs = findall(c_e_summed .> 0)

c_e_counts = counts[nz_e_idxs] .* c_e_summed[nz_e_idxs]

# other counts already took care of complex size, magnitude, etc.
C_W = C * W * W2
C_W_normed = C_W ./ (sum(C_W, dims=2) .+ 0.0000001)

E_W_capped = C_W_normed[nz_e_idxs, :] .* c_e_counts

e_w = vec(sum(E_W_capped, dims=1))

top_e_w_rank = sortperm(e_w, rev=true)

metal_atoms_per_protein_mass = e_w ./ (mass_per_class .* 10^15)
remove_nan = findall(isnan.(metal_atoms_per_protein_mass))
top_ratios = sortperm(metal_atoms_per_protein_mass, rev=true)

filtered_top_ratios = [item for item in top_ratios if item  remove_nan]
filtered_top_ratios = [item for item in filtered_top_ratios if item  top_mass_filter]

[pathway_names[filtered_top_ratios] metal_atoms_per_protein_mass[filtered_top_ratios]][1:10, :]
[69]:
10×2 Matrix{Any}:
 "Cofactor, Carrier, and Vitamin\nBiosynthesis"  7422.56
 "Regulation of transcription"                   4481.01
 "TCA cycle"                                     3568.59
 "Nucleoside and Nucleotide\nBiosynthesis"       3086.55
 "Carbohydrate Biosynthesis"                     3027.99
 "Aminoacyl-tRNA Charging"                       2566.21
 "Transcription"                                 2461.83
 "Glycolysis"                                    2262.1
 "Amino Acid Biosynthesis"                       2185.71
 "Uncategorized"                                 1601.68
[70]:
plot(bar(pathway_names[filtered_top_ratios][1:15], metal_atoms_per_protein_mass[filtered_top_ratios][1:15]),
    legend = false, size=[1000,500],
    rotation=45,
    title="Transition metal atoms per fg of protein, per sector",
    xlabel="Pathway/function", ylabel="Transition metal atoms / fg protein",
    bottom_margin=25Plots.mm, left_margin=5Plots.mm)

# savefig("figures/atoms_per_fg_per_sector.svg")
[70]:
[71]:
plot(bar(pathway_names[filtered_top_ratios][1:15], (mass_per_class .* 10^15)[filtered_top_ratios][1:15]),
    legend = false, size=[1000,500],
    rotation=45,
    title="fg of protein per sector",
    xlabel="Pathway/function", ylabel="fg protein",
    bottom_margin=25Plots.mm, left_margin=5Plots.mm)

# savefig("figures/protein_per_sector.svg")
[71]:
[72]:
plot(bar(pathway_names[filtered_top_ratios][1:15], e_w[filtered_top_ratios][1:15]),
    legend = false, size=[1000,500],
    rotation=45,
    title="Transition metal atoms per sector",
    xlabel="Pathway/function", ylabel="fg protein",
    bottom_margin=25Plots.mm, left_margin=5Plots.mm)

# savefig("figcolorsures/atoms_per_sector.svg")
[72]:

Areas and distribution across sectors

[73]:
total_areas = (counts_min .* protein_areas) / sum( (counts_min .* protein_areas) )
area_order = sortperm(total_areas, rev=true)

[protein_names[area_order[1:20]] total_areas[area_order[1:20]] counts_min[area_order[1:20]] protein_areas[area_order[1:20]]]
[73]:
20×4 Matrix{Any}:
 "cytochrome\n<i>bo<sub>3</sub></i>\nubiquinol:oxygen\noxidoreductase"  …  0.179996   6775   84.1249
 "multidrug efflux pump\naccessory protein AcrZ"                           0.0597384  2418   78.2291
 "30S ribosomal subunit\nbiogenesis factor LepA"                           0.0254909   987   81.7785
 "\n&alpha;-ketoglutarate:H<sup>+</sup>\nsymporter"                        0.0229106  4168   17.4052
 "NADH:quinone oxidoreductase I"                                           0.0225476   666  107.201
 "putative signal transduction\nprotein (SH3 domain)"                   …  0.019549   1222   50.6552
 "mannose-specific PTS enzyme\nIIAB component"                             0.0194774   716   86.1367
 "chemotaxis protein CheW"                                                 0.018631   5724   10.3064
 "chemotaxis protein\nmethyltransferase"                                   0.0182144  2143   26.9131
 "glucose-specific PTS enzyme II"                                          0.0148469  1410   33.3418
 "lipoprotein-28"                                                       …  0.0143852  1257   36.2371
 "HEMEOSYN-MONOMER"                                                        0.0141098  2588   17.2635
 "mannose-specific PTS enzyme II"                                          0.0138191   508   86.1367
 "thiamin triphosphate synthase"                                           0.013651    988   43.75
 "protein-glutamate\nmethylesterase/protein\nglutamine deamidase"          0.0128335  2097   19.3784
 "pyridine nucleotide\ntranshydrogenase"                                …  0.01215     877   43.8679
 "methyl-accepting chemotaxis\nprotein Tar"                                0.0118396  1133   33.0885
 "methyl-accepting chemotaxis\nprotein - dipeptide-sensing"                0.011367   1108   32.4846
 "tail-anchored inner membrane\nprotein ElaB"                              0.0108953  2470   13.9673
 "cystine/sulfocysteine:cation\nsymporter"                                 0.0108531  1345   25.5507
[74]:

# first, get mass per top 20 class # normalized class belonging W_comb = W * W2 W_norm = Array{Float64}(copy(W_comb)) for i in 1:size(W_norm)[1] W_norm[i, :] = W_norm[i, :] / (sum(W_norm[i, :]) + 0.00001) end # first, for cplxes without cofactor mass_per_class = (transpose(W_norm) * transpose(Diagonal(monomer_masses)) * transpose(C) * vec(counts)) mass_per_class = Array{Float64}(mass_per_class) mass_per_class = mass_per_class ./ (10^(10)) mass_per_class = mass_per_class ./ (6.023 * 10^(13)) top_mass_class = sortperm(mass_per_class, rev=true) top_mass_filter = top_mass_class[1:15] # now, get counts of transition metal ions per class c_e_summed = vec(sum((C * P * E)[:, element_idxs], dims=2)) nz_e_idxs = findall(c_e_summed .> 0) c_e_counts = counts[nz_e_idxs] .* c_e_summed[nz_e_idxs] # other counts already took care of complex size, magnitude, etc. C_W = C * W * W2 C_W_normed = C_W ./ (sum(C_W, dims=2) .+ 0.0000001) E_W_capped = C_W_normed[nz_e_idxs, :] .* c_e_counts e_w = vec(sum(E_W_capped, dims=1)) top_e_w_rank = sortperm(e_w, rev=true) metal_atoms_per_protein_mass = e_w ./ (mass_per_class .* 10^15) remove_nan = findall(isnan.(metal_atoms_per_protein_mass)) top_ratios = sortperm(metal_atoms_per_protein_mass, rev=true) filtered_top_ratios = [item for item in top_ratios if item remove_nan] filtered_top_ratios = [item for item in filtered_top_ratios if item top_mass_filter] [pathway_names[filtered_top_ratios] metal_atoms_per_protein_mass[filtered_top_ratios]][1:10, :]
[74]:
10×2 Matrix{Any}:
 "Cofactor, Carrier, and Vitamin\nBiosynthesis"  7422.56
 "Regulation of transcription"                   4481.01
 "TCA cycle"                                     3568.59
 "Nucleoside and Nucleotide\nBiosynthesis"       3086.55
 "Carbohydrate Biosynthesis"                     3027.99
 "Aminoacyl-tRNA Charging"                       2566.21
 "Transcription"                                 2461.83
 "Glycolysis"                                    2262.1
 "Amino Acid Biosynthesis"                       2185.71
 "Uncategorized"                                 1601.68

Fraction of active protein complexes with at least one cofactor

[75]:
cur_elements = ["FE", "ZN", "MN", "CA", "NI", "CU", "MO", "SE", "CO"]

element_idxs = [element in cur_elements for element in element_names]

# first, get indices of cplxes with trace elements
C_bin = C .> 0
P_bin = P .> 0
E_bin = E .> 0

CE_bin = C_bin * P_bin * E_bin

bin_cplx = vec(sum(CE_bin[1:length(complex_names), element_idxs], dims=2) .> 0)

# then, get fraction of mass per cplx
all_masses = transpose(Diagonal(monomer_masses)) * transpose(C) * vec(counts)
all_masses = Array{Float64}(all_masses)
all_masses = all_masses ./ (10^(10))
all_masses = all_masses ./ (6.023 * 10^(13))

# filter to cplx
sum(all_masses[1:length(complex_names)])
sum(all_masses[1:length(complex_names)][bin_cplx]) / sum(all_masses[1:length(complex_names)])
[75]:
0.20555513875940712

distributed across classes

[76]:
# normalized class belonging
W_comb = W * W2
W_norm = Array{Float64}(copy(W_comb))

for i in 1:size(W_norm)[1]
    W_norm[i, :] = W_norm[i, :] / (sum(W_norm[i, :]) + 0.00001)
end

# first, for cplxes without cofactor
C_cut = C[1:length(complex_names), :]
counts_cut = counts[1:length(complex_names)]
counts_cut_pos = counts_cut .* bin_cplx

mass_per_class = (transpose(W_norm) * transpose(Diagonal(monomer_masses)) * transpose(C_cut) * vec(counts_cut))
mass_per_class = Array{Float64}(mass_per_class)
mass_per_class = mass_per_class ./ (10^(10))
mass_per_class = mass_per_class ./ (6.023 * 10^(13))

pos_mass_per_class = (transpose(W_norm) * transpose(Diagonal(monomer_masses)) * transpose(C_cut) * vec(counts_cut_pos))
pos_mass_per_class = Array{Float64}(pos_mass_per_class)
pos_mass_per_class = pos_mass_per_class ./ (10^(10))
pos_mass_per_class = pos_mass_per_class ./ (6.023 * 10^(13))

top_mass_class = sortperm(mass_per_class, rev=true)

pos_ratio = pos_mass_per_class ./ mass_per_class

mass_relative = mass_per_class ./ sum(mass_per_class)
[pathway_names[top_mass_class] mass_per_class[top_mass_class] pos_mass_per_class[top_mass_class] pos_ratio[top_mass_class]][1:15, :]
[76]:
15×4 Matrix{Any}:
 "Translation"                                   …  4.46783e-14  0.558086
 "Amino Acid Biosynthesis"                          1.81731e-15  0.0881357
 "Uncategorized"                                    2.58773e-15  0.129334
 "Glycolysis"                                       9.93137e-16  0.0700675
 "Porin"                                            3.42698e-17  0.00308418
 "Transcription"                                 …  7.42277e-15  0.761018
 "Cofactor, Carrier, and Vitamin\nBiosynthesis"     1.16685e-15  0.121448
 "TCA cycle"                                        1.69235e-16  0.0196658
 "Nucleoside and Nucleotide\nBiosynthesis"          1.68902e-15  0.209012
 "Carbohydrate Biosynthesis"                        6.075e-16    0.0805286
 "Aminoacyl-tRNA Charging"                       …  2.24993e-15  0.34714
 "Protein folding"                                  1.40702e-16  0.0308965
 "Electron Transfer Chains"                         4.3787e-15   0.977489
 "Regulation of transcription"                      3.56045e-16  0.0794931
 "Fermentation"                                     1.58086e-15  0.395724
[77]:
length(complex_names)
[77]:
1093

misc

[78]:
findfirst(isequal("REACTIVE-OXYGEN-SPECIES-DEGRADATION"), pathway_names)
[79]:
findfirst(isequal("DETOX1-PWY"), pathway_names)
[80]:
findfirst(isequal("CARBPSYN-CPLX"), protein_names)
[ ]:

[81]:
C
[81]:
5527×4434 Matrix{Int64}:
 0  2  0  0  0  0   0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  2  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  10  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  2  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  4     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 ⋮              ⋮               ⋮     ⋱           ⋮              ⋮
 0  0  0  0  0  0   0  0  0  0  0  0  …  1  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  1  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  1  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  1  0  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  1  0  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0  …  0  0  0  0  0  1  0  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  1  0  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  1  0  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  1  0  0  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  1  0  0
 0  0  0  0  0  0   0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  1  0
 0  0  0  0  0  0   0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  1
[82]:
protein_names
[82]:
5527-element Vector{String}:
 "1-phosphofructokinase"
 "2-oxoglutarate dehydrogenase\ncomplex"
 "3-isopropylmalate\ndehydrogenase"
 "3-isopropylmalate dehydratase"
 "3-methyl-2-oxobutanoate\nhydroxymethyltransferase"
 "3-oxoacyl-[acyl carrier\nprotein] synthase 2"
 "6-phosphofructokinase 1"
 "6-phosphofructokinase 2"
 "6-phosphogluconate\ndehydrogenase, decarboxylating"
 "7-&alpha;-hydroxysteroid\ndehydrogenase"
 "8-amino-7-oxononanoate\nsynthase"
 "ferric enterobactin ABC\ntransporter"
 "iron(III) hydroxamate ABC\ntransporter"
 ⋮
 "putative ABC transporter\nmembrane subunit YphD"
 "putative ABC transporter\nATP-binding protein YphE"
 "DnaA initiator-associating\nprotein DiaA"
 "intermembrane phospholipid\ntransport system, integral\nmembrane subunit MlaE"
 "intermembrane phospholipid\ntransport system, ATP binding\nsubunit MlaF"
 "putative transport protein\nYrbG"
 "galactofuranose ABC\ntransporter periplasmic\nbinding protein"
 "galactofuranose ABC\ntransporter putative ATP\nbinding subunit"
 "galactofuranose ABC\ntransporter putative membrane\nsubunit YtfT"
 "Zn<sup>2+</sup> ABC\ntransporter periplasmic\nbinding protein"
 "Zn<sup>2+</sup> ABC\ntransporter membrane subunit"
 "Zn<sup>2+</sup> ABC\ntransporter ATP binding\nsubunit"
[83]:
counts
[83]:
5527-element Vector{Int64}:
   83
  185
 2695
 5307
  558
 1192
 1252
  135
 5327
  254
  181
  191
   84
    ⋮
    3
    0
    3
  152
  253
   51
   10
    0
   12
 1247
    1
   29

for khoa

[84]:
CSV.write("data/complex_to_class_matrix.csv",  Tables.table(C_W_normed), writeheader=true, header=unwrapped_pathway_names)
[84]:
"data/complex_to_class_matrix.csv"